*===============================================================*
* Programmer: James Nguyen, US EPA                              *
*                                                               *
* Project: Power and effect size inflation in epi. studies      *
*                                                               *
* Purpose:                                                      *
*           - applicable to studies where RR is estimated       *
*           - estimates power of the study design for testing:  *
*               + true RR > 1 with a significance level = alpha *
*                 (i.e., the lower bound of (100*(1-alpha)      *
*                 confidence interval) > 1                      *
*               + true RR < 1 with a significance level = alpha *
*                 (i.e., the upper bound of (100*(1-alpha)      *
*                 confidence interval) < 1                      *
*           - characterizes the distribution of observed        *
*             significant RRs                                   *
*                                                               *
* Assumptions:                                                  *
*        - time of follow-up is different between subjects      *
*          and the distributions of follow-up time are          *
*          different between studies - these typically are not  *
*          well characterized in the publications.              *
*                                                               *
*        - For simplicity, we assume all subjects have          *
*          same time of follow-up.  This assumption typically   *
*          over-estimates the power of the study design.        *
*                                                               *
* Notes: the DM statement in the Macro clears all the SAS       *
*        output, SAS log, and the Results.  Some WARNING and    *
*        ERROR messages can occur in the log.                   *
*                                                               *
*      - When the parameter EXACT = N (i.e., the Empirical      *
*        Standard Error Estimates is used), WARNING and ERROR   *
*        messages can occur in the SAS log for some datasets    *
*        for which one group has 0 values for all subjects).    *
*        The EXACT Test is used for these datasets.             *
*                                                               *
*      - All datasets for which both groups have 0 values for   *
*        all subjects are not included in the data analysis.    *
*                                                               *
*      - This Macro is the revision of the code version         *
*        revised in 10/2019                                     *
*                                                               *
* Date:  10/2021                                                *
*==============================================================*;
option nodate nonumber formDlim="=" ls=100 ps=100 nomprint;

%let junk=C:\Users\JNguyen\Desktop\Junks;

%Macro ESM_Rate(R0=, 			/* rates of disease in reference/control group                          */
				RR=, 			/* true rate ratios or relative risks                                   */
				N0=, 			/* sample size of control group                                         */
				N1=,   			/* sample size of exposure group                                        */
				Exact=,			/* if "Exact = Y" then Exact test for all datasets                		*/
					 			/* if "Exact = N" then Exact test only for non-convergent datasets 		*/
				alpha=,			/* level of significance                                                */
				pctiles=,		/* percentiles of significant ORs: 2.5, 5, 10, 25, 50, 75, 90, 95, 97.5 */
				ipctiles=,		/* percentiles that need to calculate inflation factors           		*/
				NSim=,          /* number of iterations, i.e. datasets                                  */
				seed=,			/* seed number to reproduce random numbers                              */
				outfile=);  

	*===> count number of control rates;
	%let R0N=1;
	%let R0&R0N = %nrbquote(%scan(&R0,&R0N, %str( )));
	%do %while (&&R0&R0N ^=);
		%let R0N=%eval(&R0N+1);
		%let R0&R0N = %nrbquote(%scan(&R0,&R0N, %str( )));
	%end;
	%let R0N=%eval(&R0N-1);

	*===> count number of RRs;
	%let RN=1;
	%let RR&RN = %nrbquote(%scan(&RR,&RN, %str( )));
	%do %while (&&RR&RN ^=);
		%let RN=%eval(&RN+1);
		%let RR&RN = %nrbquote(%scan(&RR,&RN, %str( )));
	%end;
	%let RN=%eval(&RN-1);

	*===> count the percentiles that need to calculate the inflation factors;
	%let ipn=1;
	%let ip&ipn = %nrbquote(%scan(&ipctiles,&ipn, %str( )));
	%do %while (&&ip&ipn ^=);
		%let ipn=%eval(&ipn+1);
		%let ip&ipn = %nrbquote(%scan(&ipctiles,&ipn, %str( )));
	%end;
	%let ipn=%eval(&ipn-1);

	title;

	*===> create a dataset of control rates and RRs;
	Data R_RR; set _NULL_; run;
	%do j = 1 %to &R0N;
		%do k=1 %to &RN;
			Data New; 
				*==> rate of disease in reference group;
				R0=&&R0&j; 
				true_RR=&&RR&k; 
				*==> rate of disease in compared group;
				R1 = true_RR*R0;
				N0 = &N0;
				N1 = &N1;
			run;
			Data R_RR; set R_RR New; run;
			Proc datasets nolist; delete New; quit;
		%end; *k;
	%end; *j;

	Proc sort data = R_RR; 	by R0 R1 true_RR N0 N1; run;

	*===> create datasets with Total number of cases;
	Data Sim;
		call streaminit(&seed);
		set R_RR;
		do group = 1 to 2;
			Rate = (group=1)*R0 + (group=2)*R1;
			N = (group=1)*N0 + (group=2)*N1;

			*==> for the sake of simplicity, follow-up time is set = 1 for everyone;
			*==> Total Cases;
			%do Sim = 1 %to &NSim;
				Total&Sim = rand("Poisson", N*Rate);
			%end; *sim;
			output;
		end; *group;
		drop Rate;	
	run;
	*===> create datasets with individual case-subject data;
	Data Sim;
		set Sim;
		do i = 1 to N;
			%do Sim = 1 %to &NSim;
				*==> case for the i-th subject at each Sim;
				Case&Sim = (Total&Sim >= i);
			%end; *sim;
			output;
		end;
		drop N Total:;
	run;

	Data All; set _NULL_;  SCOL=.;run;

	ods listing close;
	ods select none;

	*===> use generalzied linear model to estimate RRs and their p-values;
	%do i = 1 %to &NSim;

		Proc SQL;
			create table maxSim&i as
			select R0, R1, true_RR, N0, N1, max(case&i) as max&i
			from Sim
			group by R0, R1, true_RR, N0, N1
			order by R0, R1, true_RR, N0, N1;
		quit;

		Data Sim&i;
			merge Sim(keep= R0 R1 true_RR N0 N1 group i case&i) maxSim&i;
			by  R0 R1 true_RR N0 N1;
			if max&i > 0;
			drop max&i;
		run;

		Proc datasets nolist; delete maxSim&i; quit;

		Proc SQL noprint;
			select count(*) into: Nrow
			from Sim&i;
		quit;
		%let Nrow=&Nrow;

		%if &Nrow ^= 0 %then %do;

			%if &Exact = Y %then %do;
				ods output ExpExactParmEst=Exact(drop=CISides Note Parameter ClassVal0 alpha);
				Proc GENMOD data = Sim&i;
					by R0 R1 true_RR N0 N1;
					class group(ref="1")/param=ref ;
					model case&i = group/dist=poisson link=log alpha=&alpha;
					exact group/estimate=odds alpha=&alpha;
				run;

				Data All; set All Exact(in=new); if new then Sim = &i;	run;

				Proc datasets nolist; delete Exact Sim&i; quit;
			%end; *Exact = Y;

			%if &Exact = N %then %do;

				Data NPower; set _NULL_; run;

				ods output estimates= NPower(keep=R0 R1 true_RR N0 N1 MeanEstimate ProbChiSq MeanLowerCL MeanUpperCL);
				Proc GENMOD data = Sim&i;
					by R0 R1 true_RR N0 N1;
					class group i;
					model case&i = group/dist=poisson link=log;
					*==> use a sandwich estimator;
					repeated subject=i(group)/type=unstr;
					estimate "group: 2 vs. 1" group -1 1/alpha=&alpha; 
				run;

				Proc SQL noprint;
					select count(*) into: Nrow1
					from NPower;
				quit;
				%let Nrow1=&Nrow1;


				%if &Nrow1 = 0 %then %do;
					ods output ExpExactParmEst=NExact(drop=CISides Note Parameter ClassVal0 alpha);
					Proc GENMOD data = Sim&i;
						by R0 R1 true_RR N0 N1;
						class group(ref="1")/param=ref ;
						model case&i = group/dist=poisson link=log;
						exact group/estimate=odds alpha=&alpha;
					run;

					Data All; 
						set All NExact(in=new); if new then Sim = &i;	run;
					run;

					Proc datasets nolist; delete NExact; quit;
				%end;*Nrow1 = 0;


				%if &Nrow1 ^= 0 %then %do;

					Data NPower;
						set NPower;
						Estimate=MeanEstimate;
						LowerCL=MeanLowerCL;
						UpperCL=MeanUpperCL;
						Pvalue=ProbChiSq;
						drop MeanEstimate MeanLowerCL MeanUpperCL ProbChiSq;
					run;

					Data All; set All NPower(in=new); if new then Sim = &i;	run;


					*==> create datasets that one group has case = 0 for all subjects;

					Data Sim&i;
						merge Sim&i NPower(in=s keep = R0 R1 true_RR N0 N1);
						by R0 R1 true_RR N0 N1;
						if s = 0;
					run;

					Proc SQL noprint;
						select count(*) into: Nrow2
						from Sim&i;
					quit;
					%let Nrow2=&Nrow2;

					%if &Nrow2 ^= 0 %then %do;
						ods output ExpExactParmEst=NExact(drop=CISides Note Parameter ClassVal0 alpha);
						Proc GENMOD data = Sim&i;
							by R0 R1 true_RR N0 N1;
							class group(ref="1")/param=ref ;
							model case&i = group/dist=poisson link=log;
							exact group/estimate=odds alpha=&alpha;
						run;
						Data All; 
							set All NExact(in=new); if new then Sim = &i;	run;
						run;
						Proc datasets nolist; delete NExact; quit;
					%end;*Nrow2;

				%end; *Nrow1 ^= 0;

				Proc datasets nolist; delete Sim&i NPower; quit;

			%end; *Exact = N;

		%end; *Nrow;

		DM "log;clear; output;clear; odsresults;clear";

	%end; *i;
	
	Proc datasets nolist; delete  Sim; quit;

	Proc SQL noprint;
		select count(*) into: NAll
		from All;
	quit;
	%let NAll=&NAll;


	%if &NAll = 0 %then %do;

		ods listing;
		ods select default;

		title;
		%if &outfile ^= %then %do; ods rtf file = "&outfile" startpage=no bodytitle; %end;
		data _null_;
		    file print;
		    put "All subjects have same outcome value.  None of the datasets were valid for analysis";
		    file log;
		run;
		%if &outfile ^= %then %do; 	ods rtf close; %end;
	%end; *NAll = 0;

	%if &NAll ^= 0 %then %do;
		Data All;
			set All;
			if      true_RR > 1 and LowerCL > 1 then do;
				Significance = 1;
				Rate_Ratio = Estimate;
			end;
			else if true_RR < 1 and UpperCL > 1 then do;
				Significance = 1;
				Rate_Ratio = Estimate;
			end;
			else do;
				Significance = 0;
				Rate_Ratio = .;
			end;			
		run;

		Proc sort data = All; by R0 R1 true_RR N0 N1; run;

		Proc SQL noprint;
			create table Power as
				select R0, R1, true_RR, N0, N1, count(*) as valid, round(avg(Significance),0.001) as Power
				from All
				group by R0, R1, true_RR, N0, N1
				order by R0, R1, true_RR, N0, N1;

			select count(*) into : NPower1
				from All
				where significance = 1;
			%let NPower1 = &NPower1;

		quit;

		%if &NPower1 > 0 %then %do;
			Proc univariate data =All noprint;
				where significance = 1;
				by R0 R1 true_RR N0 N1;
				var Rate_Ratio;
				output out = SigRRs pctlpts=&pctiles pctlpre= pctl;
			run;

			Data SigRRs;
				set SigRRs;
				%do v = 1 %to &ipn;
					%if &&ip&v = 2.5 	%then %do; infl_P025 = pctl2_5/true_RR; %end;
					%if &&ip&v = 5 		%then %do; infl_P05  = pctl5/true_RR;   %end;
					%if &&ip&v = 10 	%then %do; infl_P10  = pctl10/true_RR;  %end;
					%if &&ip&v = 25 	%then %do; infl_P25  = pctl25/true_RR;  %end;
					%if &&ip&v = 50 	%then %do; infl_P50  = pctl50/true_RR;  %end;
					%if &&ip&v = 75 	%then %do; infl_P75  = pctl75/true_RR;  %end;
					%if &&ip&v = 90 	%then %do; infl_P90  = pctl90/true_RR;  %end;
					%if &&ip&v = 95 	%then %do; infl_P95  = pctl95/true_RR;  %end;
					%if &&ip&v = 97.5 	%then %do; infl_P975 = pctl97_5/true_RR; %end;
				%end;
			run;

			Data ESM;
				merge Power SigRRs R_rr;
				by R0 R1 true_RR N0 N1;
				if valid = . then valid = 0;
			run;
		%end;
		%if &NPower1 = 0 %then %do;
			Data ESM;
				merge Power R_rr;
				by R0 R1 true_RR N0 N1;
				if valid = . then valid = 0;
			run;
		%end;

		ods listing;
		ods select default;

		%if &outfile ^= %then %do; ods rtf file = "&outfile" startpage=no bodytitle; %end;

		%if &RR1 > 1 %then %do;
			title "Power and distribution of observed RRs with lower bound of %sysevalf(100*(1-&alpha))% CI > 1";
			title2 	%if &Exact = Y %then %do; "Exact Test was used for all datasets"; %end;
					%if &Exact = N %then %do; "Exact Test was used only for non-convergent datasets"; %end;;
		%end;
		%if &&RR&RN < 1 %then %do;
			title "Power and distribution of observed RRs with upper bound of %sysevalf(100*(1-&alpha))% CI < 1";
			title2 	%if &Exact = Y %then %do; "Exact Test was used for all datasets"; %end;
					%if &Exact = N %then %do; "Exact Test was used only for non-convergent datasets"; %end;;
		%end;
		%if &RR1 < 1 and &&RR&RN > 1 %then %do;
			title "Power and distribution of observed RRs with:";
			title2 "the upper bound of %sysevalf(100*(1-&alpha))% CI < 1 if true RR < 1";
			title3 "or the lower bound of %sysevalf(100*(1-&alpha))% CI > 1 if true RR > 1";
			title4 	%if &Exact = Y %then %do; "Exact Test was used for all datasets"; %end;
					%if &Exact = N %then %do; "Exact Test was used only for non-convergent datasets"; %end;;
		%end;


		Proc Print data = ESM noobs label; format PCTL: infl: 6.3; label valid = "# valid datasets"; run;
		%if &outfile ^= %then %do; 	ods rtf close; %end;

	%end; *NAll ^= 0;


%Mend;

%ESM_Rate(R0=0.005618 0.011237 0.022473, 
		  RR=1.2 1.5 2.0 3.0, 
		  N0=17710, N1=1710, Exact = N, alpha=0.1,
		  pctiles=10 50 90,
		  ipctiles=50,
		  NSim=50, seed=2412544,
		  outfile=C:\Users\JNguyen\Desktop\Junks\example on page 18 of the Working paper 2019.rtf);